Model 1: Continuous Unconditioned Generation¶

Imports and Constants¶

In [ ]:
import os
import random
import librosa
import librosa.display
import matplotlib.pyplot as plt
import numpy as np
import torch
import torchaudio
import soundfile as sf
from IPython.display import Audio

def dummy_npwarn_decorator_factory():
  def npwarn_decorator(x):
    return x
  return npwarn_decorator
np._no_nep50_warning = getattr(np, '_no_nep50_warning', dummy_npwarn_decorator_factory)
In [ ]:
SAMPLE_RATE = 44100
EXPORT = './export'
random.seed(24)
In [ ]:
torch.cuda.empty_cache()

Exploratory Data Analysis and Preprocessing¶

Our data comes from various sources:

  • www.youtube.com/watch?v=lTRiuFIWV54
  • archive.org/details/deep-spot-city-of-gamers-chill-gaming-studying-lofi-hip-hop-mix-1-hour
  • https://archive.org/details/lo-fi-for-reading-manga-pb8ljm/Mabisyo+-+Lo%E2%80%8B-%E2%80%8BFi+For+Reading+Manga+-+04+Monkey+D.+Luffy.flac
  • https://archive.org/details/motel-smoke-at-2-am
  • https://archive.org/details/kalaido-hanging-lanterns_202101/flovry+-+car+radio.wav

With the help of some chatGPT code I converted all of the clips to .wav and split them into 11 second chunks. Uncomment and run todo preprocessing on new clips

In [ ]:
# # code adjusted from chatGPT with prompts "write me a snippet of code to convert flac, wav, mp3, and m4a into 11 second long wav files with a sample rate of 44100 Hz", "write me a snippet of code to convert flac, wav, mp3, and m4a into 11 second long wav files with a sample rate of 44100 Hz", "discard the final chunk instead of padding=", "can you use torchaudio instead of pydub"

# # Settings
# input_dir = "./raw_audio/unfinished"
# output_dir = "./train_data"
# target_sr = 44100
# chunk_duration_s = 11  # 11 seconds
# chunk_duration_samples = target_sr * chunk_duration_s  # 11 seconds in samples

# os.makedirs(output_dir, exist_ok=True)
# valid_exts = (".flac", ".wav", ".mp3", ".m4a")

# def load_audio(file_path, sr):
#     """
#     Load audio using torchaudio for supported formats or librosa for others.
#     """
#     ext = file_path.lower().split('.')[-1]

#     if ext in ['flac', 'wav']:
#         waveform, original_sr = torchaudio.load(file_path)
#     else:
#         waveform, original_sr = librosa.load(file_path, sr=None)  # Load with original sampling rate
#         waveform = torch.tensor(waveform)  # Convert to torch tensor

#     if original_sr != sr:
#         # Resample if needed
#         resampler = torchaudio.transforms.Resample(orig_freq=original_sr, new_freq=sr)
#         waveform = resampler(waveform)

#     return waveform

# def split_audio(waveform, chunk_length_samples):
#     """
#     Split the waveform into 11-second chunks.
#     """
#     num_chunks = waveform.size(1) // chunk_length_samples
#     return [waveform[:, i * chunk_length_samples:(i + 1) * chunk_length_samples] for i in range(num_chunks)]

# for filename in os.listdir(input_dir):
#     if filename.lower().endswith(valid_exts):
#         input_path = os.path.join(input_dir, filename)
#         base_name = os.path.splitext(filename)[0]

#         try:
#             # Load audio
#             waveform = load_audio(input_path, target_sr)

#             # Split into full-length chunks
#             chunks = split_audio(waveform, chunk_duration_samples)

#             # Save each chunk as a .wav file
#             for i, chunk in enumerate(chunks):
#                 chunk_filename = f"{base_name}_chunk{i+1:03d}.wav"
#                 chunk_path = os.path.join(output_dir, chunk_filename)

#                 # Save chunk
#                 torchaudio.save(chunk_path, chunk, target_sr)
#                 print(f"✅ Saved: {chunk_filename}")

#         except Exception as e:
#             print(f"❌ Error processing {filename}: {e}")

Spectrogram conversion¶

In [ ]:
## Code adjusted from chatGPT prompted with "using librosa loop through a batch of wav files and give me their spectrograms"
## Directory containing WAV files
#wav_dir = "./train_data"
#out_dir = "./train_data_spectrograms"
#wav_files = [f for f in os.listdir(wav_dir) if f.endswith(".wav")]

## Number of files to process
#max_files = 1000

#for i, file in enumerate(wav_files[:max_files]):
#    file_path = os.path.join(wav_dir, file)

#    # Load stereo audio (converts to mono by default)
#    y, sr = librosa.load(file_path, sr=SAMPLE_RATE, mono=True)

#    # Compute log-mel spectrogram
#    S = librosa.feature.melspectrogram(y=y, sr=sr, n_fft=2048, hop_length=512, n_mels=128)
#    np.save(os.path.join(out_dir, f'{i}_mel.npy'), S)

Now let's take a look at and listen to the audio generated from these spectrograms

In [ ]:
tensor_dir = "./train_data_spectrograms"


tensor_files = [f for f in os.listdir(tensor_dir) if f.endswith(".npy")]

max_samples = 4

for file in random.sample(tensor_files, max_samples):
    file_path = os.path.join(tensor_dir, file)

    # Load tensors
    mel = np.load(file_path)
    print(mel.shape)

    # Print spectrogram
    log_S = librosa.power_to_db(mel, ref=np.max)

    # Plot
    plt.figure(figsize=(10, 4))
    librosa.display.specshow(log_S, sr=SAMPLE_RATE, hop_length=512, x_axis='time', y_axis='mel')
    plt.title(f"Log-Mel Spectrogram: {file}")
    plt.colorbar(format="%+2.0f dB")
    plt.tight_layout()
    plt.savefig(os.path.join(EXPORT, f"{file}_Log-Mel.png"))

    # Spectrogram to audio
    y = librosa.feature.inverse.mel_to_audio(mel, sr = SAMPLE_RATE, n_fft=2048, hop_length=512)
    display(Audio(y, rate=SAMPLE_RATE))
    sf.write(os.path.join(EXPORT, f"{file}.wav"), y, SAMPLE_RATE)
(128, 948)
Your browser does not support the audio element.
(128, 948)
Your browser does not support the audio element.
(128, 948)
Your browser does not support the audio element.
(128, 948)
Your browser does not support the audio element.
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image

Model¶

I am following the guide on training a diffusion model from Diffusers https://huggingface.co/docs/diffusers/v0.33.1/en/tutorials/basic_training

I tested various hyperparameters and found the hyperparameters below to be the best for decently fast training.

In [ ]:
from dataclasses import dataclass

@dataclass
class TrainingConfig:
    image_size = (128, 128) #(128, 256)  # the generated image resolution
    train_batch_size = 8
    eval_batch_size = 8  # how many images to sample during evaluation
    num_epochs = 100
    gradient_accumulation_steps = 1
    learning_rate = 1e-4
    lr_warmup_steps = 500
    save_image_epochs = 20
    save_model_epochs = 20
    mixed_precision = "fp16"  # `no` for float32, `fp16` for automatic mixed precision
    output_dir = "mel-generation"  # the model name locally and on the HF Hub
    overwrite_output_dir = True  # overwrite the old model when re-running the notebook
    seed = 0

    max_samples = -1


config = TrainingConfig()

Dataset Loader¶

In [ ]:
from torch.utils.data import DataLoader, Dataset
In [ ]:
class SpectrogramDataset(Dataset):
    def __init__(self, spectrogram_dir, sample_size=(128, 896), max_samples = -1):
        self.sample_size = sample_size
        self.spectrogram_files = []
        for i, f in enumerate(os.listdir(spectrogram_dir)):
            if max_samples != -1 and max_samples < i :
                break
            if f.endswith(".npy"):
                self.spectrogram_files.append(os.path.join(spectrogram_dir, f))



    def __len__(self):
        return len(self.spectrogram_files)

    def __getitem__(self, idx):
        # If idx is a slice, return a batch of spectrograms
        if isinstance(idx, slice):
            return [self.__getitem__(i) for i in range(*idx.indices(len(self)))]

        # Fetch a single spectrogram and return it with batch dimension
        spectrogram = np.load(self.spectrogram_files[idx])

        # crop to match sample_size (I'm not center cropping bc I see no benefit here)
        target_h, target_w = self.sample_size
        spectrogram = spectrogram[:target_h, :target_w]

        spectrogram = torch.tensor(spectrogram, dtype=torch.float32).unsqueeze(0)  # Add batch dimension
        return spectrogram
dataset = SpectrogramDataset('./train_data_spectrograms', sample_size = config.image_size, max_samples=config.max_samples)
In [ ]:
train_dataloader = torch.utils.data.DataLoader(dataset, batch_size=config.train_batch_size, shuffle=True)
In [ ]:
for i, data in enumerate(dataset[:2]):

    display(data)
tensor([[[4.9801e+01, 9.5735e+01, 8.4604e+01,  ..., 1.9413e+00,
          1.8168e+00, 1.7897e+00],
         [9.9013e+01, 3.3336e+02, 3.9260e+02,  ..., 8.8301e+00,
          8.7300e+00, 8.7972e+00],
         [5.0395e+01, 8.8918e+01, 7.5389e+01,  ..., 2.2583e+00,
          2.5656e+00, 2.1667e+00],
         ...,
         [1.4460e-04, 3.6040e-05, 1.7846e-09,  ..., 1.0010e-09,
          1.1840e-09, 1.4590e-09],
         [1.4301e-04, 3.5646e-05, 1.0841e-09,  ..., 1.1986e-09,
          1.0729e-09, 1.0354e-09],
         [1.4195e-04, 3.5379e-05, 1.2232e-09,  ..., 1.3664e-09,
          1.3587e-09, 1.7225e-09]]])
tensor([[[5.5976e-03, 8.9468e-04, 2.3048e-05,  ..., 2.1976e-05,
          5.0490e-05, 7.8640e-05],
         [7.7224e-03, 1.4194e-03, 1.3512e-04,  ..., 4.6296e-04,
          1.8532e-04, 2.7411e-04],
         [1.2057e-02, 7.2128e-03, 1.4132e-02,  ..., 3.1841e-03,
          1.7689e-03, 8.8070e-04],
         ...,
         [3.7608e-05, 9.3756e-06, 1.4952e-09,  ..., 1.6208e-09,
          1.8723e-09, 1.3833e-09],
         [3.7141e-05, 9.2574e-06, 1.6648e-09,  ..., 1.6661e-09,
          2.0163e-09, 1.9332e-09],
         [3.6892e-05, 9.1962e-06, 1.6135e-09,  ..., 1.3921e-09,
          1.4930e-09, 1.5270e-09]]])
In [ ]:
train_dataloader = torch.utils.data.DataLoader(dataset, batch_size=config.train_batch_size, shuffle=True)

Diffusion Model¶

In [ ]:
from diffusers import UNet2DModel

model = UNet2DModel(
    sample_size=config.image_size,  # the target image resolution
    in_channels=1,  # the number of input channels, 3 for RGB images
    out_channels=1,  # the number of output channels
    layers_per_block=2,  # how many ResNet layers to use per UNet block
    block_out_channels=(128, 948, 256, 256, 512, 512),  # the number of output channels for each UNet block
    down_block_types=(
        "DownBlock2D",  # a regular ResNet downsampling block
        "DownBlock2D",
        "DownBlock2D",
        "DownBlock2D",
        "AttnDownBlock2D",  # a ResNet downsampling block with spatial self-attention
        "DownBlock2D",
    ),
    up_block_types=(
        "UpBlock2D",  # a regular ResNet upsampling block
        "AttnUpBlock2D",  # a ResNet upsampling block with spatial self-attention
        "UpBlock2D",
        "UpBlock2D",
        "UpBlock2D",
        "UpBlock2D",
    ),
    norm_num_groups = 2,
)

# # Load from checkpoint TODO
# from diffusers import DDPMPipeline
# if config.checkpoint:
#     checkpoint_dir = os.path.join(config.output_dir, "checkpoints", config.checkpoint)
#     loaded_pipeline = DDMPipeline.from_pretrained(checkpoint_dir)
#     model = loaded_pipeline.unet
#     noise_scheduler = loaded_pipeline.scheduler
C:\Users\arthurchan\miniconda3\envs\cse153\lib\site-packages\tqdm\auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html
  from .autonotebook import tqdm as notebook_tqdm
In [ ]:
sample_image = dataset[0].unsqueeze(0)
print("Input shape:", sample_image.shape)
print("Output shape:", model(sample_image, timestep=0).sample.shape)
Input shape: torch.Size([1, 1, 128, 128])
Output shape: torch.Size([1, 1, 128, 128])
In [ ]:
from PIL import Image
from diffusers import DDPMScheduler

noise_scheduler = DDPMScheduler(num_train_timesteps=1000)
noise = torch.randn(sample_image.shape)
timesteps = torch.LongTensor([50])
noisy_image = noise_scheduler.add_noise(sample_image, noise, timesteps)

noisy_image.shape
Out[ ]:
torch.Size([1, 1, 128, 128])
In [ ]:
# visualize noisy sample
log_S = librosa.power_to_db(noisy_image[0,0,:,:], ref=np.max)

# Plot
plt.figure(figsize=(10, 4))
librosa.display.specshow(log_S, sr=SAMPLE_RATE, hop_length=512, x_axis='time', y_axis='mel')
plt.title("Noisy Log-Mel Spectrogram")
plt.colorbar(format="%+2.0f dB")
plt.tight_layout()
plt.show()
No description has been provided for this image
In [ ]:
import torch.nn.functional as F

noise_pred = model(noisy_image, timesteps).sample

loss = F.mse_loss(noise_pred, noise)

loss
Out[ ]:
tensor(1.0781, grad_fn=<MseLossBackward0>)

Training¶

In [ ]:
from diffusers.optimization import get_cosine_schedule_with_warmup

optimizer = torch.optim.AdamW(model.parameters(), lr=config.learning_rate)

lr_scheduler = get_cosine_schedule_with_warmup(

    optimizer=optimizer,

    num_warmup_steps=config.lr_warmup_steps,

    num_training_steps=(len(train_dataloader) * config.num_epochs),

)
In [ ]:
from diffusers import DDPMPipeline
from diffusers.utils import make_image_grid

def evaluate(config, epoch, pipeline):

    # Sample some images from random noise (this is the backward diffusion process).

    # The default pipeline output type is `List[PIL.Image]`

    images = pipeline(

        batch_size=config.eval_batch_size,

        generator=torch.Generator(device='cpu').manual_seed(config.seed), # Use a separate torch generator to avoid rewinding the random state of the main training loop

        output_type = "",

        # num_inference_steps = 2 # david (undo)

    ).images

    # Save the images
    test_dir = os.path.join(config.output_dir, "samples")

    os.makedirs(test_dir, exist_ok=True)

    for i, image in enumerate(images):
        image = np.squeeze(image)
        np.save(os.path.join(test_dir, f"{epoch}_{i}.npy"), image)
    # image_grid.save(.png")
In [ ]:
from accelerate import Accelerator
from tqdm.auto import tqdm
from pathlib import Path

def train_loop(config, model, noise_scheduler, optimizer, train_dataloader, lr_scheduler):
    # Initialize accelerator and tensorboard logging
    accelerator = Accelerator(
        mixed_precision=config.mixed_precision,
        gradient_accumulation_steps=config.gradient_accumulation_steps,
        log_with="tensorboard",
        project_dir=os.path.join(config.output_dir, "logs"),
    )
    if accelerator.is_main_process:
        assert config.output_dir is not None
        os.makedirs(config.output_dir, exist_ok=True)

        accelerator.init_trackers("train_example")

    # Prepare everything
    # There is no specific order to remember, you just need to unpack the
    # objects in the same order you gave them to the prepare method.
    model, optimizer, train_dataloader, lr_scheduler = accelerator.prepare(
        model, optimizer, train_dataloader, lr_scheduler
    )

    global_step = 0

    # Now you train the model
    for epoch in range(config.num_epochs):
        progress_bar = tqdm(total=len(train_dataloader), disable=not accelerator.is_local_main_process)
        progress_bar.set_description(f"Epoch {epoch}")

        for step, batch in enumerate(train_dataloader):
            clean_images = batch
            # Sample noise to add to the images
            noise = torch.randn(clean_images.shape, device=clean_images.device)
            bs = clean_images.shape[0]

            # Sample a random timestep for each image
            timesteps = torch.randint(
                0, noise_scheduler.config.num_train_timesteps, (bs,), device=clean_images.device,
                dtype=torch.int64
            )

            # Add noise to the clean images according to the noise magnitude at each timestep
            # (this is the forward diffusion process)
            noisy_images = noise_scheduler.add_noise(clean_images, noise, timesteps)

            with accelerator.accumulate(model):
                # Predict the noise residual
                noise_pred = model(noisy_images, timesteps, return_dict=False)[0]
                loss = F.mse_loss(noise_pred, noise)
                accelerator.backward(loss)

                if accelerator.sync_gradients:
                    accelerator.clip_grad_norm_(model.parameters(), 1.0)
                optimizer.step()
                lr_scheduler.step()
                optimizer.zero_grad()

            progress_bar.update(1)
            logs = {"loss": loss.detach().item(), "lr": lr_scheduler.get_last_lr()[0], "step": global_step}
            progress_bar.set_postfix(**logs)
            accelerator.log(logs, step=global_step)
            global_step += 1

        # After each epoch you optionally sample some demo images with evaluate() and save the model
        if accelerator.is_main_process:
            pipeline = DDPMPipeline(unet=accelerator.unwrap_model(model), scheduler=noise_scheduler)

            if (epoch + 1) % config.save_image_epochs == 0 or epoch == config.num_epochs - 1:
                evaluate(config, epoch, pipeline)

            if (epoch + 1) % config.save_model_epochs == 0 or epoch == config.num_epochs - 1:
                pipeline.save_pretrained(os.path.join(config.output_dir, "checkpoints", f"epoch_{epoch + 1}"))

Run training¶

Uncomment and run to train the model

In [ ]:
# from accelerate import notebook_launcher

# args = (config, model, noise_scheduler, optimizer, train_dataloader, lr_scheduler)

# notebook_launcher(train_loop, args, num_processes=1)

Evaluation¶

Below are the audio samples and the spectrograms of the diffusion model at various epochs {epoch}_{sample num}.npy

In [ ]:
sample_dir = "./mel-generation/128x256"
# sample_dir = os.path.join(config.output_dir, "samples")
In [ ]:
nplist_files = [f for f in os.listdir(sample_dir) if f.endswith(".npy")]

max_files = -1

for i, file in enumerate(nplist_files[:]):
    file_path = os.path.join(sample_dir, file)

    # Load tensors
    mel = np.load(file_path)
    print(file)

    log_S = librosa.power_to_db(mel, ref=np.max)

    # Plot
    plt.figure(figsize=(10, 4))
    librosa.display.specshow(log_S, sr=SAMPLE_RATE, hop_length=512, x_axis='time', y_axis='mel')
    plt.title(f"Log-Mel Spectrogram: {file}")
    plt.colorbar(format="%+2.0f dB")
    plt.tight_layout()
    plt.savefig(os.path.join(EXPORT, f"{file}_Log-Mel.png"))

    # Spectrogram to audio
    y = librosa.feature.inverse.mel_to_audio(mel, sr = SAMPLE_RATE, n_fft=2048, hop_length=512)
    display(Audio(y, rate=SAMPLE_RATE))
    sf.write(os.path.join(EXPORT, f"{file}.wav"), y, SAMPLE_RATE)
19_0.npy
Your browser does not support the audio element.
19_1.npy
Your browser does not support the audio element.
19_2.npy
Your browser does not support the audio element.
19_3.npy
Your browser does not support the audio element.
39_0.npy
Your browser does not support the audio element.
39_1.npy
Your browser does not support the audio element.
39_2.npy
Your browser does not support the audio element.
39_3.npy
Your browser does not support the audio element.
59_0.npy
Your browser does not support the audio element.
59_1.npy
Your browser does not support the audio element.
59_2.npy
Your browser does not support the audio element.
59_3.npy
Your browser does not support the audio element.
79_0.npy
Your browser does not support the audio element.
79_1.npy
Your browser does not support the audio element.
79_2.npy
Your browser does not support the audio element.
79_3.npy
Your browser does not support the audio element.
99_0.npy
Your browser does not support the audio element.
99_1.npy
Your browser does not support the audio element.
99_2.npy
Your browser does not support the audio element.
99_3.npy
Your browser does not support the audio element.
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image

Baseline comparison¶

In [ ]:
baseline = np.random.rand(128,256)
baseline = baseline * SAMPLE_RATE / 2
In [ ]:
log_S = librosa.power_to_db(baseline, ref=np.max)

file = "baseline.npy"

# Plot
plt.figure(figsize=(10, 4))
librosa.display.specshow(log_S, sr=SAMPLE_RATE, hop_length=512, x_axis='time', y_axis='mel')
plt.title(f"Log-Mel Spectrogram: {file}")
plt.colorbar(format="%+2.0f dB")
plt.tight_layout()
plt.savefig(os.path.join(EXPORT, f"{file}_Log-Mel.png"))

# Spectrogram to audio
y = librosa.feature.inverse.mel_to_audio(mel, sr = SAMPLE_RATE, n_fft=2048, hop_length=512)
display(Audio(y, rate=SAMPLE_RATE))
sf.write(os.path.join(EXPORT, f"{file}.wav"), y, SAMPLE_RATE)
Your browser does not support the audio element.
No description has been provided for this image

Model 2: Symbolic Unconditioned Generation¶

Install and load required libraries¶

In [ ]:
# !pip install miditok
# !pip install mido
#!pip install symusic
#!pip install glob
#!pip install torch
In [ ]:
# !unzip data.zip
In [ ]:
from google.colab import files

import glob
import random
from typing import List
from collections import defaultdict

import numpy as np
from numpy.random import choice

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

from mido import MidiFile
from symusic import Score
from miditok import REMI, TokenizerConfig

Exploratory Data Analysis¶

In [ ]:
train_files = glob.glob("./data/train/*.mid")
test_files = glob.glob("./data/test/*.mid")
In [ ]:
def get_midi_len(file):
  try:
    mid = MidiFile(file)
    return mid.length
  except Exception as e:
    print(e)
    return 0
In [ ]:
num_samples = len(train_files)
print(f"Number of training samples: {num_samples}")
total_length = sum([get_midi_len(file) for file in train_files])
print(f"Total length of training samples in ticks: {total_length}")
avg_length = total_length / num_samples
print(f"Avg length of training samples in ticks: {avg_length}")
Number of training samples: 638
Total length of training samples in ticks: 72914.93901779644
Avg length of training samples in ticks: 114.28673827240821

Model: Second Order Markov Chain¶

This model serves as a baseline of comparison for our LSTM model.

Preprocessing¶

Train Midi Tokenizer

In [ ]:
config = TokenizerConfig(num_velocities=1, use_chords=False, use_programs=True)
tokenizer = REMI(config)
tokenizer.train(vocab_size=1000, files_paths=train_files)
tokenizer.save("tokenizer.json")
/usr/local/lib/python3.11/dist-packages/miditok/tokenizations/remi.py:88: UserWarning: Attribute controls are not compatible with 'config.one_token_stream_for_programs' and multi-vocabulary tokenizers. Disabling them from the config.
  super().__init__(tokenizer_config, params)

Construct PyTorch Dataset and Dataloaders

In [ ]:
class MIDIDataset(Dataset):
    def __init__(self, file_paths: List[str], tokenizer):
        self.tokenizer = tokenizer
        self.file_paths = file_paths
    def __len__(self):
        return len(self.file_paths)

    def __getitem__(self, idx):
        midi = Score(self.file_paths[idx])
        tokens = self.tokenizer(midi)
        return np.array(tokens)
In [ ]:
train_dataset = MIDIDataset(train_files, tokenizer)
test_dataset = MIDIDataset(test_files, tokenizer)

train_loader = DataLoader(train_dataset, batch_size=1, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=1, shuffle=False)

Model¶

In [ ]:
class SecondOrderMarkovChain:
    def __init__(self):
        self.transitions = defaultdict(lambda: defaultdict(int))
        self.probabilities = defaultdict(lambda: defaultdict(float))

    def train(self, train_loader):
        for sequence in train_loader:
            sequence = sequence[0].numpy().astype(int)
            for i in range(len(sequence) - 2):
                state1, state2 = sequence[i], sequence[i + 1]
                next_state = sequence[i + 2]
                self.transitions[(state1, state2)][next_state] += 1

        for (state1, state2), next_states in self.transitions.items():
            total = sum(next_states.values())
            for next_state, count in next_states.items():
                self.probabilities[(state1, state2)][next_state] = count / total
        return self.probabilities

    def generate(self, test_sequence, num_predictions=1):
        test_sequence = test_sequence[0].numpy().astype(int)
        results = [test_sequence[0], test_sequence[1]]
        for i in range(100):
            if (results[-2], results[-1]) not in self.probabilities:
                break
            else:
                probs = self.probabilities[(results[-2], results[-1])]
                states = list(probs.keys())
                probabilities = list(probs.values())
                if not states:
                    break
                try:
                    predictions = np.random.choice(states, size=num_predictions, p=probabilities)
                except:
                    break
                results.append(predictions[0])
        return results

Training¶

In [ ]:
model = SecondOrderMarkovChain()
model.train(train_loader)

predictions = []
for test_sequence in test_loader:
    predictions.append(model.generate(test_sequence))
for i, prediction in enumerate(predictions):
    output_score = tokenizer.decode(torch.Tensor(prediction))
    output_score.dump_midi(f"markov/{i}.mid")

Download Output

In [ ]:
# !zip -r markov.zip ./markov
# files.download("markov.zip")

Model: LSTM¶

This is the primary model I will be exploring

Preprocessing¶

In [ ]:
from miditok.pytorch_data import DatasetMIDI, DataCollator

tokenizer = REMI()  # using defaults parameters (constants.py)
train_dataset = DatasetMIDI(
    files_paths=train_files,
    tokenizer=tokenizer,
    max_seq_len=1024,
    bos_token_id=tokenizer["BOS_None"],
    eos_token_id=tokenizer["EOS_None"],
)
test_dataset = DatasetMIDI(
    files_paths=test_files,
    tokenizer=tokenizer,
    max_seq_len=1024,
    bos_token_id=tokenizer["BOS_None"],
    eos_token_id=tokenizer["EOS_None"],
)
collator = DataCollator(tokenizer.pad_token_id)
train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True, collate_fn=collator)
test_loader = DataLoader(test_dataset, batch_size=4, shuffle=False, collate_fn=collator)
In [ ]:
len(train_loader), len(test_loader)
Out[ ]:
(160, 18)

Model¶

In [ ]:
class MusicRNN(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_layers, dropout):
        super(MusicRNN, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.rnn = nn.LSTM(
            input_size=embedding_dim,
            hidden_size=hidden_dim,
            num_layers=num_layers,
            batch_first=True,
            dropout=dropout
        )
        self.fc = nn.Linear(hidden_dim, vocab_size)

    def forward(self, x, hidden=None):
        # x: (batch_size, seq_length)
        x = self.embedding(x)  # (batch_size, seq_length, embedding_dim)
        out, hidden = self.rnn(x, hidden)  # out: (batch_size, seq_length, hidden_dim)
        out = self.fc(out)  # (batch_size, seq_length, vocab_size)
        return out, hidden

Training¶

In [ ]:
def train(model, train_loader, val_loader, vocab_size, num_epochs=20, lr=0.001, device='cuda'):
    model = model.to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=lr)

    for epoch in range(num_epochs):
        # --------- Training ---------
        model.train()
        total_train_loss = 0

        for batch in train_loader:
            batch = batch['input_ids'].to(device)  # (batch_size, seq_length)

            inputs = batch[:, :-1]
            targets = batch[:, 1:]

            optimizer.zero_grad()
            outputs, _ = model(inputs)
            outputs = outputs.reshape(-1, vocab_size)
            targets = targets.reshape(-1)

            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()

            total_train_loss += loss.item()

        avg_train_loss = total_train_loss / len(train_loader)

        # --------- Validation ---------
        model.eval()
        total_val_loss = 0
        with torch.no_grad():
            for batch in val_loader:
                batch = batch['input_ids'].to(device)

                inputs = batch[:, :-1]
                targets = batch[:, 1:]

                outputs, _ = model(inputs)
                outputs = outputs.reshape(-1, vocab_size)
                targets = targets.reshape(-1)

                loss = criterion(outputs, targets)
                total_val_loss += loss.item()

        avg_val_loss = total_val_loss / len(val_loader)

        print(f"Epoch {epoch+1}/{num_epochs} | Train Loss: {avg_train_loss:.4f} | Val Loss: {avg_val_loss:.4f}")


# Example usage
if __name__ == "__main__":
    vocab_size = tokenizer.vocab_size
    embedding_dim = 256
    hidden_dim = 512
    num_layers = 2
    dropout = .3

    model = MusicRNN(vocab_size, embedding_dim, hidden_dim, num_layers, dropout)
    train(model, train_loader, test_loader, vocab_size, num_epochs=15)
Epoch 1/15 | Train Loss: 2.7964 | Val Loss: 1.9718
Epoch 2/15 | Train Loss: 1.7968 | Val Loss: 1.6201
Epoch 3/15 | Train Loss: 1.5021 | Val Loss: 1.4396
Epoch 4/15 | Train Loss: 1.3268 | Val Loss: 1.3043
Epoch 5/15 | Train Loss: 1.2244 | Val Loss: 1.2479
Epoch 6/15 | Train Loss: 1.1488 | Val Loss: 1.2160
Epoch 7/15 | Train Loss: 1.0998 | Val Loss: 1.1965
Epoch 8/15 | Train Loss: 1.0568 | Val Loss: 1.1939
Epoch 9/15 | Train Loss: 1.0244 | Val Loss: 1.1704
Epoch 10/15 | Train Loss: 0.9711 | Val Loss: 1.1626
Epoch 11/15 | Train Loss: 0.9323 | Val Loss: 1.1570
Epoch 12/15 | Train Loss: 0.8970 | Val Loss: 1.1454
Epoch 13/15 | Train Loss: 0.8553 | Val Loss: 1.1402
Epoch 14/15 | Train Loss: 0.8196 | Val Loss: 1.1539
Epoch 15/15 | Train Loss: 0.7933 | Val Loss: 1.1445

Train 15 Epoch Model¶

In [ ]:
def sample(model, start_token, max_length=100, temperature=1.0, device='cuda'):
    model = model.to(device)
    model.eval()

    generated = [start_token]
    input_token = torch.tensor([[start_token]], device=device)  # (1, 1)

    hidden = None

    for _ in range(max_length):
        output, hidden = model(input_token, hidden)  # output: (1, 1, vocab_size)
        output = output[:, -1, :]  # take the last output
        output = output / temperature  # adjust randomness

        probs = F.softmax(output, dim=-1)  # (1, vocab_size)
        next_token = torch.multinomial(probs, num_samples=1).item()
        generated.append(next_token)
        if next_token == 2 or next_token == 0: # reach end of sequence
          break

        input_token = torch.tensor([[next_token]], device=device)

    return generated

start_token = tokenizer.special_tokens_ids[1]
generated_sequence = sample(model, start_token, max_length=1024)

print("Generated token sequence:")
print(generated_sequence)
Generated token sequence:
[1, 4, 189, 44, 124, 132, 197, 49, 124, 128, 201, 47, 124, 128, 205, 51, 124, 128, 209, 49, 124, 132, 217, 44, 124, 126, 219, 44, 124, 126, 4, 189, 42, 124, 128, 193, 47, 124, 128, 197, 46, 124, 136, 209, 51, 124, 128, 213, 47, 124, 128, 217, 47, 124, 128, 4, 189, 42, 124, 132, 197, 40, 124, 132, 205, 40, 124, 132, 213, 37, 124, 132, 4, 189, 38, 124, 126, 191, 34, 124, 126, 193, 35, 124, 126, 195, 30, 124, 126, 197, 18, 124, 140, 213, 30, 124, 126, 217, 43, 124, 126, 219, 42, 124, 126, 4, 189, 40, 124, 140, 47, 124, 140, 205, 41, 124, 140, 42, 124, 140, 4, 189, 66, 124, 140, 38, 124, 138, 205, 40, 124, 140, 44, 124, 140, 4, 189, 53, 124, 156, 4, 189, 44, 124, 156, 4, 189, 44, 124, 126, 191, 42, 124, 126, 193, 41, 124, 126, 195, 37, 124, 126, 197, 35, 124, 126, 199, 37, 124, 126, 201, 38, 124, 126, 203, 38, 124, 126, 205, 30, 124, 126, 207, 42, 124, 126, 209, 46, 124, 126, 211, 46, 124, 126, 213, 25, 124, 126, 215, 46, 124, 126, 217, 42, 124, 126, 219, 42, 124, 126, 4, 189, 40, 124, 126, 191, 43, 124, 126, 193, 40, 124, 126, 195, 42, 124, 126, 201, 44, 124, 126, 203, 47, 124, 126, 205, 44, 124, 126, 207, 48, 124, 126, 209, 54, 124, 126, 211, 52, 124, 126, 213, 50, 124, 126, 215, 46, 124, 126, 217, 47, 124, 126, 219, 49, 124, 126, 4, 189, 47, 124, 126, 191, 45, 124, 126, 193, 49, 124, 126, 195, 47, 124, 126, 197, 116, 124, 126, 199, 47, 124, 126, 201, 44, 124, 126, 203, 42, 124, 126, 205, 47, 124, 126, 207, 52, 124, 126, 209, 54, 124, 126, 211, 44, 124, 126, 213, 47, 124, 132, 4, 189, 46, 124, 132, 197, 49, 124, 126, 199, 47, 124, 126, 201, 49, 124, 126, 203, 52, 124, 126, 207, 45, 124, 126, 209, 47, 124, 126, 211, 47, 124, 126, 213, 51, 124, 126, 215, 50, 124, 126, 217, 46, 124, 126, 219, 45, 124, 126, 4, 189, 45, 124, 128, 193, 43, 124, 126, 219, 38, 124, 126, 205, 42, 124, 126, 207, 42, 124, 126, 209, 47, 124, 128, 213, 41, 124, 132, 4, 189, 35, 124, 126, 191, 40, 124, 126, 193, 47, 124, 126, 195, 39, 124, 126, 197, 40, 124, 140, 213, 50, 124, 132, 4, 193, 43, 124, 126, 195, 42, 124, 126, 197, 45, 124, 126, 199, 43, 124, 126, 201, 41, 124, 126, 203, 40, 124, 126, 205, 45, 124, 126, 207, 49, 124, 126, 209, 39, 124, 126, 211, 45, 124, 126, 213, 43, 124, 126, 215, 45, 124, 126, 217, 44, 124, 126, 219, 47, 124, 126, 4, 189, 41, 124, 132, 197, 45, 124, 132, 205, 44, 124, 132, 213, 40, 124, 132, 4, 189, 43, 124, 132, 197, 42, 124, 132, 205, 44, 124, 132, 4, 189, 44, 124, 140, 205, 46, 124, 140, 4, 189, 47, 124, 140, 205, 47, 124, 140, 4, 189, 49, 124, 140, 205, 37, 124, 140, 4, 189, 35, 124, 140, 205, 37, 124, 140, 4, 189, 37, 124, 140, 4, 4, 189, 63, 124, 132, 56, 124, 140, 197, 59, 124, 132, 205, 59, 124, 132, 58, 124, 132, 213, 59, 124, 132, 61, 124, 132, 4, 189, 63, 124, 126, 67, 124, 126, 191, 66, 124, 126, 59, 124, 126, 193, 61, 124, 126, 66, 124, 126, 195, 64, 124, 126, 64, 124, 126, 199, 66, 124, 126, 66, 124, 126, 201, 63, 124, 132, 52, 124, 132, 46, 124, 132, 205, 66, 124, 132, 209, 66, 124, 126, 74, 124, 126, 211, 66, 124, 126, 66, 124, 126, 213, 75, 124, 126, 215, 66, 124, 126, 78, 124, 126, 217, 66, 124, 126, 66, 124, 126, 219, 65, 124, 126, 71, 124, 126, 4, 189, 66, 124, 126, 71, 124, 126, 193, 71, 124, 126, 63, 124, 126, 195, 68, 124, 126, 69, 124, 126, 197, 66, 124, 126, 64, 124, 126, 199, 69, 124, 126, 59, 124, 126, 201, 68, 123, 126, 69, 124, 126, 203, 64, 124, 126, 71, 124, 126, 205, 66, 124, 126, 66, 124, 126, 207, 65, 124, 126, 68, 124, 126, 209, 69, 124, 126, 64, 124, 126, 211, 68, 124, 126, 65, 124, 126, 215, 63, 124, 126, 66, 124, 126, 217, 61, 124, 126, 4, 191, 67, 124, 126, 72, 124, 126, 193, 73, 124, 126, 124, 124, 126, 195, 64, 124, 126, 57, 124, 126, 197, 71, 124, 126, 55, 124, 126, 199, 59, 124, 126, 56, 124, 126, 201, 64, 124, 126, 65, 124, 126, 203, 60, 124, 126, 69, 124, 126, 205, 66, 124, 126, 69, 124, 126, 207, 62, 124, 126, 59, 124, 126, 209, 65, 124, 126, 64, 124, 126, 211, 59, 124, 126, 213, 64, 124, 126, 69, 124, 126, 215, 67, 124, 126, 217, 65, 124, 126, 57, 124, 126, 219, 58, 124, 126, 62, 124, 126, 4, 189, 48, 124, 126, 52, 124, 126, 191, 55, 124, 126, 55, 124, 126, 193, 52, 124, 126, 57, 124, 126, 195, 52, 124, 126, 53, 124, 126, 197, 57, 124, 126, 57, 124, 126, 199, 52, 124, 126, 57, 124, 126, 201, 53, 124, 126, 57, 124, 126, 203, 57, 124, 126, 59, 124, 126, 205, 55, 124, 126, 57, 124, 126, 207, 55, 124, 126, 57, 124, 126, 209, 57, 124, 126, 55, 124, 126, 211, 56, 124, 128, 57, 124, 126, 215, 52, 124, 126, 57, 124, 126, 4, 189, 53, 124, 136, 57, 124, 140, 56, 124, 130, 201, 56, 124, 132, 61, 124, 132, 209, 51, 124, 128, 54, 124, 128, 213, 53, 124, 132, 56, 124, 132, 4, 189, 59, 124, 128, 56, 124, 128, 193, 52, 124, 128, 54, 124, 128, 57, 124, 128, 197, 59, 124, 128, 51, 124, 128, 201, 51, 124, 128, 61, 124, 128, 205, 58, 124, 128, 53, 124, 128, 61, 124, 128, 209, 56, 124]
In [ ]:
for i, start_token in enumerate(random.sample(list(tokenizer.vocab.values()), 10)):
  generated_sequence = sample(model, start_token, max_length=1024)
  output_score = tokenizer.tokens_to_midi([generated_sequence])
  output_score.dump_midi(f"rnn_15/rnn_{i}.mid")
<ipython-input-19-2ec83c333e95>:3: UserWarning: miditok: The `tokens_to_midi` method had been renamed `decode`. It is now depreciated and will be removed in future updates.
  output_score = tokenizer.tokens_to_midi([generated_sequence])
In [ ]:
# !zip -r rnn_15.zip ./rnn_15
# files.download("rnn_15.zip")

Train 25 Epoch Model¶

In [ ]:
train(model, train_loader, test_loader, vocab_size, num_epochs=10)
Epoch 1/10 | Train Loss: 0.7644 | Val Loss: 1.1469
Epoch 2/10 | Train Loss: 0.7250 | Val Loss: 1.1617
Epoch 3/10 | Train Loss: 0.6925 | Val Loss: 1.1655
Epoch 4/10 | Train Loss: 0.6596 | Val Loss: 1.1808
Epoch 5/10 | Train Loss: 0.6267 | Val Loss: 1.1996
Epoch 6/10 | Train Loss: 0.6004 | Val Loss: 1.2031
Epoch 7/10 | Train Loss: 0.5703 | Val Loss: 1.2232
Epoch 8/10 | Train Loss: 0.5399 | Val Loss: 1.2488
Epoch 9/10 | Train Loss: 0.5151 | Val Loss: 1.2462
Epoch 10/10 | Train Loss: 0.4844 | Val Loss: 1.2806
In [ ]:
for i, start_token in enumerate(random.sample(list(tokenizer.vocab.values()), 10)):
  generated_sequence = sample(model, start_token, max_length=1024)
  output_score = tokenizer.tokens_to_midi([generated_sequence])
  output_score.dump_midi(f"rnn_25/rnn_{i}.mid")
<ipython-input-23-b088643153c2>:3: UserWarning: miditok: The `tokens_to_midi` method had been renamed `decode`. It is now depreciated and will be removed in future updates.
  output_score = tokenizer.tokens_to_midi([generated_sequence])
In [ ]:
# !zip -r rnn_25.zip ./rnn_25
# files.download("rnn_25.zip")

Train 50 Epoch Model¶

In [ ]:
train(model, train_loader, test_loader, vocab_size, num_epochs=25)
Epoch 1/25 | Train Loss: 0.4660 | Val Loss: 1.3109
Epoch 2/25 | Train Loss: 0.4301 | Val Loss: 1.3256
Epoch 3/25 | Train Loss: 0.4093 | Val Loss: 1.3385
Epoch 4/25 | Train Loss: 0.3873 | Val Loss: 1.3756
Epoch 5/25 | Train Loss: 0.3738 | Val Loss: 1.4017
Epoch 6/25 | Train Loss: 0.3531 | Val Loss: 1.4187
Epoch 7/25 | Train Loss: 0.3376 | Val Loss: 1.4391
Epoch 8/25 | Train Loss: 0.3207 | Val Loss: 1.4662
Epoch 9/25 | Train Loss: 0.3036 | Val Loss: 1.4819
Epoch 10/25 | Train Loss: 0.2872 | Val Loss: 1.5067
Epoch 11/25 | Train Loss: 0.2718 | Val Loss: 1.5175
Epoch 12/25 | Train Loss: 0.2532 | Val Loss: 1.5516
Epoch 13/25 | Train Loss: 0.2442 | Val Loss: 1.5718
Epoch 14/25 | Train Loss: 0.2355 | Val Loss: 1.5953
Epoch 15/25 | Train Loss: 0.2303 | Val Loss: 1.6487
Epoch 16/25 | Train Loss: 0.2297 | Val Loss: 1.6254
Epoch 17/25 | Train Loss: 0.2108 | Val Loss: 1.6725
Epoch 18/25 | Train Loss: 0.1975 | Val Loss: 1.7077
Epoch 19/25 | Train Loss: 0.1855 | Val Loss: 1.7225
Epoch 20/25 | Train Loss: 0.1789 | Val Loss: 1.7439
Epoch 21/25 | Train Loss: 0.1782 | Val Loss: 1.7471
Epoch 22/25 | Train Loss: 0.1725 | Val Loss: 1.7761
Epoch 23/25 | Train Loss: 0.1651 | Val Loss: 1.7997
Epoch 24/25 | Train Loss: 0.1592 | Val Loss: 1.8070
Epoch 25/25 | Train Loss: 0.1531 | Val Loss: 1.8271
In [ ]:
for i, start_token in enumerate(random.sample(list(tokenizer.vocab.values()), 10)):
  generated_sequence = sample(model, start_token, max_length=1024)
  output_score = tokenizer.tokens_to_midi([generated_sequence])
  output_score.dump_midi(f"rnn_50/rnn_{i}.mid")
<ipython-input-26-ce489eda5a3f>:3: UserWarning: miditok: The `tokens_to_midi` method had been renamed `decode`. It is now depreciated and will be removed in future updates.
  output_score = tokenizer.tokens_to_midi([generated_sequence])
In [ ]:
!zip -r rnn_50.zip ./rnn_50
files.download("rnn_50.zip")
  adding: rnn_50/ (stored 0%)
  adding: rnn_50/rnn_5.mid (deflated 60%)
  adding: rnn_50/rnn_0.mid (deflated 67%)
  adding: rnn_50/rnn_1.mid (deflated 63%)
  adding: rnn_50/rnn_3.mid (deflated 59%)
  adding: rnn_50/rnn_2.mid (deflated 79%)
  adding: rnn_50/rnn_6.mid (deflated 66%)
  adding: rnn_50/rnn_4.mid (deflated 64%)
  adding: rnn_50/rnn_7.mid (deflated 62%)
  adding: rnn_50/rnn_8.mid (deflated 68%)
  adding: rnn_50/rnn_9.mid (deflated 70%)
In [ ]: